{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 7.2 Data Transformation（数据变换）\n",
    "\n",
    "# 1 删除重复值\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>k1</th>\n",
       "      <th>k2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>one</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>two</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>one</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>two</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>one</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>two</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>two</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    k1  k2\n",
       "0  one   1\n",
       "1  two   1\n",
       "2  one   2\n",
       "3  two   3\n",
       "4  one   3\n",
       "5  two   4\n",
       "6  two   4"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],\n",
    "                     'k2': [1, 1, 2, 3, 3, 4, 4]})\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "DataFrame方法duplicated返回的是一个boolean Series，表示一个row是否是重复的（根据前一行来判断）："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2    False\n",
       "3    False\n",
       "4    False\n",
       "5    False\n",
       "6     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.duplicated()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "drop_duplicateds返回一个DataFrame，会删除重复的部分："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>k1</th>\n",
       "      <th>k2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>one</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>two</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>one</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>two</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>one</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>two</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    k1  k2\n",
       "0  one   1\n",
       "1  two   1\n",
       "2  one   2\n",
       "3  two   3\n",
       "4  one   3\n",
       "5  two   4"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "上面两种方法都默认考虑所有列；另外，我们可以指定一部分来检测重复值。假设我们只想检测'k1'列的重复值："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>k1</th>\n",
       "      <th>k2</th>\n",
       "      <th>v1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>one</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>two</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>one</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>two</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>one</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>two</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>two</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    k1  k2  v1\n",
       "0  one   1   0\n",
       "1  two   1   1\n",
       "2  one   2   2\n",
       "3  two   3   3\n",
       "4  one   3   4\n",
       "5  two   4   5\n",
       "6  two   4   6"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['v1'] = range(7)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>k1</th>\n",
       "      <th>k2</th>\n",
       "      <th>v1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>one</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>two</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    k1  k2  v1\n",
       "0  one   1   0\n",
       "1  two   1   1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.drop_duplicates(['k1'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "duplicated和drop_duplicated默认保留第一次观测到的数值组合。设置`keep='last'`能返回最后一个："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>k1</th>\n",
       "      <th>k2</th>\n",
       "      <th>v1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>one</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>two</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>one</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>two</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>one</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>two</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    k1  k2  v1\n",
       "0  one   1   0\n",
       "1  two   1   1\n",
       "2  one   2   2\n",
       "3  two   3   3\n",
       "4  one   3   4\n",
       "6  two   4   6"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.drop_duplicates(['k1', 'k2'], keep='last')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2 Transforming Data Using a Function or Mapping（用函数和映射来转换数据）\n",
    "\n",
    "有时候我们可能希望做一些数据转换。比如下面一个例子，有不同种类的肉："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>food</th>\n",
       "      <th>ounces</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>bacon</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>pulled pork</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>bacon</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Pastrami</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>corned beef</td>\n",
       "      <td>7.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Bacon</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>pastrami</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>honey ham</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>nova lox</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          food  ounces\n",
       "0        bacon     4.0\n",
       "1  pulled pork     3.0\n",
       "2        bacon    12.0\n",
       "3     Pastrami     6.0\n",
       "4  corned beef     7.5\n",
       "5        Bacon     8.0\n",
       "6     pastrami     3.0\n",
       "7    honey ham     5.0\n",
       "8     nova lox     6.0"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',\n",
    "                              'Pastrami', 'corned beef', 'Bacon',\n",
    "                              'pastrami', 'honey ham', 'nova lox'],\n",
    "                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "假设你想加一列，表明每种肉来源的动物是什么。我们可以写一个映射："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "meat_to_animal = {\n",
    "    'bacon': 'pig',\n",
    "    'pulled pork': 'pig',\n",
    "    'pastrami': 'cow',\n",
    "    'corned beef': 'cow',\n",
    "    'honey ham': 'pig',\n",
    "    'nova lox': 'salmon'\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "用于series的map方法接受一个函数，或是一个字典，包含着映射关系，但这里有一个小问题，有些肉是大写，有些是小写。因此，我们先用str.lower把所有的值变为小写:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0          bacon\n",
       "1    pulled pork\n",
       "2          bacon\n",
       "3       pastrami\n",
       "4    corned beef\n",
       "5          bacon\n",
       "6       pastrami\n",
       "7      honey ham\n",
       "8       nova lox\n",
       "Name: food, dtype: object"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lowercased = data['food'].str.lower()\n",
    "lowercased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>food</th>\n",
       "      <th>ounces</th>\n",
       "      <th>animal</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>bacon</td>\n",
       "      <td>4.0</td>\n",
       "      <td>pig</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>pulled pork</td>\n",
       "      <td>3.0</td>\n",
       "      <td>pig</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>bacon</td>\n",
       "      <td>12.0</td>\n",
       "      <td>pig</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Pastrami</td>\n",
       "      <td>6.0</td>\n",
       "      <td>cow</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>corned beef</td>\n",
       "      <td>7.5</td>\n",
       "      <td>cow</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Bacon</td>\n",
       "      <td>8.0</td>\n",
       "      <td>pig</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>pastrami</td>\n",
       "      <td>3.0</td>\n",
       "      <td>cow</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>honey ham</td>\n",
       "      <td>5.0</td>\n",
       "      <td>pig</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>nova lox</td>\n",
       "      <td>6.0</td>\n",
       "      <td>salmon</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          food  ounces  animal\n",
       "0        bacon     4.0     pig\n",
       "1  pulled pork     3.0     pig\n",
       "2        bacon    12.0     pig\n",
       "3     Pastrami     6.0     cow\n",
       "4  corned beef     7.5     cow\n",
       "5        Bacon     8.0     pig\n",
       "6     pastrami     3.0     cow\n",
       "7    honey ham     5.0     pig\n",
       "8     nova lox     6.0  salmon"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['animal'] = lowercased.map(meat_to_animal)\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "我们也可以用一个函数解决上面的问题："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       pig\n",
       "1       pig\n",
       "2       pig\n",
       "3       cow\n",
       "4       cow\n",
       "5       pig\n",
       "6       cow\n",
       "7       pig\n",
       "8    salmon\n",
       "Name: food, dtype: object"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['food'].map(lambda x: meat_to_animal[x.lower()])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用map是一个很简便的方法，用于element-wise转换和其他一些数据清洗操作。\n",
    "\n",
    "# 3 Replacing Values（替换值）\n",
    "\n",
    "其实fillna是一个特殊换的替换操作。map可以用于修改一个object里的部分值，但是replace能提供一个更简单和更灵活的方法做到这点。下面是一个series："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       1.0\n",
       "1    -999.0\n",
       "2       2.0\n",
       "3    -999.0\n",
       "4   -1000.0\n",
       "5       3.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.Series([1., -999., 2., -999., -1000., 3.])\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这里-999可能是用来表示缺失值的标识符。用NA来替代的话，用replace，会产生一个新series（除非使用inplace=True）:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       1.0\n",
       "1       NaN\n",
       "2       2.0\n",
       "3       NaN\n",
       "4   -1000.0\n",
       "5       3.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.replace(-999, np.nan)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果想要一次替换多个值，直接用一个list即可："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.0\n",
       "1    NaN\n",
       "2    2.0\n",
       "3    NaN\n",
       "4    NaN\n",
       "5    3.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.replace([-999, -1000], np.nan)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对于不同的值用不同的替换值，也是导入一个list："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.0\n",
       "1    NaN\n",
       "2    2.0\n",
       "3    NaN\n",
       "4    0.0\n",
       "5    3.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.replace([-999, -1000], [np.nan, 0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "参数也可以是一个dict："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.0\n",
       "1    NaN\n",
       "2    2.0\n",
       "3    NaN\n",
       "4    0.0\n",
       "5    3.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.replace({-999: np.nan, -1000: 0})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "注意：data.replace方法和data.str.replace方法是不同的，后者会对string进行element-wise替换。\n",
    "\n",
    "# 4 Renaming Axis Indexes（重命名Axis Indexes）\n",
    "\n",
    "\n",
    "像是series里的value一样，axis label也能类似地是函数或映射来转换，产生一个新的object。当然也可以设置in-place不产生新的数据："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "      <th>four</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Ohio</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Colorado</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>New York</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          one  two  three  four\n",
       "Ohio        0    1      2     3\n",
       "Colorado    4    5      6     7\n",
       "New York    8    9     10    11"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.DataFrame(np.arange(12).reshape((3, 4)),\n",
    "                    index=['Ohio', 'Colorado', 'New York'],\n",
    "                    columns=['one', 'two', 'three', 'four'])\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "与series相同，axis index有一个map方法："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<function __main__.<lambda>>"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "transform = lambda x: x[:4].upper()\n",
    "transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Ohio', 'Colorado', 'New York'], dtype='object')"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['OHIO', 'COLO', 'NEW '], dtype=object)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.index.map(transform)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可以赋值给index，以in-place的方式修改DataFrame："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "      <th>four</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>OHIO</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COLO</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NEW</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      one  two  three  four\n",
       "OHIO    0    1      2     3\n",
       "COLO    4    5      6     7\n",
       "NEW     8    9     10    11"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.index = data.index.map(transform)\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果你想要创建一个转换后的版本，而且不用修改原始的数据，可以用rename:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ONE</th>\n",
       "      <th>TWO</th>\n",
       "      <th>THREE</th>\n",
       "      <th>FOUR</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Ohio</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Colo</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>New</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      ONE  TWO  THREE  FOUR\n",
       "Ohio    0    1      2     3\n",
       "Colo    4    5      6     7\n",
       "New     8    9     10    11"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.rename(index=str.title, columns=str.upper)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "注意，rename能用于dict一样的oject，"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>pekaboo</th>\n",
       "      <th>four</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>INDIANA</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COLO</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NEW</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         one  two  pekaboo  four\n",
       "INDIANA    0    1        2     3\n",
       "COLO       4    5        6     7\n",
       "NEW        8    9       10    11"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.rename(index={'OHIO': 'INDIANA'},\n",
    "            columns={'three': 'pekaboo'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "rename能让你避免陷入手动赋值给index和columns的杂务中。可以用inplace直接修改原始数据："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "      <th>four</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>INDIANA</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>COLO</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NEW</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         one  two  three  four\n",
       "INDIANA    0    1      2     3\n",
       "COLO       4    5      6     7\n",
       "NEW        8    9     10    11"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.rename(index={'OHIO': 'INDIANA'}, inplace=True)\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5 Discretization and Binning（离散化和装箱）\n",
    "\n",
    "\n",
    "连续型数据经常被离散化或分散成bins（分箱）来分析。假设你有一组数据，你想把人分到不同的年龄组里："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "我们把这些分到四个bin里，19~25, 26~35, 36~60, >60。可以用pandas里的cut："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]\n",
       "Length: 12\n",
       "Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bins = [18, 25, 35, 60, 100]\n",
    "\n",
    "cats = pd.cut(ages, bins)\n",
    "\n",
    "cats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回的是一个特殊的Categorical object。我们看到的结果描述了pandas.cut如何得到bins。可以看作是一个string数组用来表示bin的名字，它内部包含了一个categories数组，用来记录不同类别的名字，并伴有表示ages的label（可以通过codes属性查看）："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cats.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cats.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18, 25]     5\n",
       "(35, 60]     3\n",
       "(25, 35]     3\n",
       "(60, 100]    1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.value_counts(cats)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这里pd.value_counts(cats)是pandas.cut后bin的数量。\n",
    "\n",
    "这里我们注意一下区间。括号表示不包含，方括号表示包含。你可以自己设定哪一边关闭（right=False）:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]\n",
       "Length: 12\n",
       "Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.cut(ages, [18, 26, 36, 61, 100], right=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "你也可以用一个list或数组给labels选项来设定bin的名字："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]\n",
       "Length: 12\n",
       "Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.cut(ages, bins, labels=group_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果你只是给一个bins的数量来cut，而不是自己设定每个bind的范围，cut会根据最大值和最小值来计算等长的bins。比如下面我们想要做一个均匀分布的四个bins："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = np.random.rand(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0.77, 0.98], (0.33, 0.55], (0.77, 0.98], (0.55, 0.77], (0.55, 0.77], ..., (0.77, 0.98], (0.11, 0.33], (0.11, 0.33], (0.33, 0.55], (0.11, 0.33]]\n",
       "Length: 20\n",
       "Categories (4, object): [(0.11, 0.33] < (0.33, 0.55] < (0.55, 0.77] < (0.77, 0.98]]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.cut(data, 4, precision=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "precision=2选项表示精确到小数点后两位。\n",
    "\n",
    "一个近似的函数，qcut，会按照数据的分位数来分箱。取决于数据的分布，用cut通常不能保证每一个bin有一个相同数量的数据点。而qcut是按百分比来切的，所以可以得到等数量的bins："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = np.random.randn(1000) # Normally distributed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(-0.717, -0.0981], (-0.717, -0.0981], (-0.0981, 0.639], (0.639, 3.434], [-2.86, -0.717], ..., (-0.0981, 0.639], (-0.717, -0.0981], (-0.0981, 0.639], (0.639, 3.434], (-0.0981, 0.639]]\n",
       "Length: 1000\n",
       "Categories (4, object): [[-2.86, -0.717] < (-0.717, -0.0981] < (-0.0981, 0.639] < (0.639, 3.434]]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cats = pd.qcut(data, 4) # Cut into quartiles\n",
    "cats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.639, 3.434]       250\n",
       "(-0.0981, 0.639]     250\n",
       "(-0.717, -0.0981]    250\n",
       "[-2.86, -0.717]      250\n",
       "dtype: int64"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.value_counts(cats)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "类似的，在cut中我们可以自己指定百分比："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[NaN, NaN, (0.1, 0.5], NaN, NaN, ..., (0.1, 0.5], NaN, (0.5, 0.9], NaN, (0.5, 0.9]]\n",
       "Length: 1000\n",
       "Categories (4, object): [(0, 0.1] < (0.1, 0.5] < (0.5, 0.9] < (0.9, 1]]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cats2 = pd.cut(data, [0, 0.1, 0.5, 0.9, 1.]) # 累进的百分比\n",
    "cats2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.1, 0.5]    135\n",
       "(0.5, 0.9]    124\n",
       "(0, 0.1]       40\n",
       "(0.9, 1]       21\n",
       "dtype: int64"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.value_counts(cats2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "在之后的章节我们还会用到cut和qcut，这些离散函数对于量化和群聚分析很有用。\n",
    "\n",
    "# 6 Detecting and Filtering Outliers（检测和过滤异常值）\n",
    "\n",
    "过滤或转换异常值是数组操作的一个重头戏。下面的DataFrame有正态分布的数据："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.010953</td>\n",
       "      <td>0.012928</td>\n",
       "      <td>0.033165</td>\n",
       "      <td>-0.031257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.011621</td>\n",
       "      <td>1.013341</td>\n",
       "      <td>1.004356</td>\n",
       "      <td>0.996333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-2.994342</td>\n",
       "      <td>-4.328036</td>\n",
       "      <td>-3.303616</td>\n",
       "      <td>-3.133495</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>-0.654483</td>\n",
       "      <td>-0.662177</td>\n",
       "      <td>-0.644982</td>\n",
       "      <td>-0.670813</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>-0.000637</td>\n",
       "      <td>-0.033241</td>\n",
       "      <td>0.050481</td>\n",
       "      <td>-0.074641</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.723100</td>\n",
       "      <td>0.725839</td>\n",
       "      <td>0.708452</td>\n",
       "      <td>0.643418</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>3.318499</td>\n",
       "      <td>3.353001</td>\n",
       "      <td>3.002853</td>\n",
       "      <td>3.002868</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 0            1            2            3\n",
       "count  1000.000000  1000.000000  1000.000000  1000.000000\n",
       "mean      0.010953     0.012928     0.033165    -0.031257\n",
       "std       1.011621     1.013341     1.004356     0.996333\n",
       "min      -2.994342    -4.328036    -3.303616    -3.133495\n",
       "25%      -0.654483    -0.662177    -0.644982    -0.670813\n",
       "50%      -0.000637    -0.033241     0.050481    -0.074641\n",
       "75%       0.723100     0.725839     0.708452     0.643418\n",
       "max       3.318499     3.353001     3.002853     3.002868"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.DataFrame(np.random.randn(1000, 4))\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "假设我们想要找一个列中，绝对值大于3的数字："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.123766</td>\n",
       "      <td>0.933920</td>\n",
       "      <td>0.494755</td>\n",
       "      <td>0.690507</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.513636</td>\n",
       "      <td>0.575393</td>\n",
       "      <td>-0.323590</td>\n",
       "      <td>0.586833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.335958</td>\n",
       "      <td>-0.843735</td>\n",
       "      <td>0.302201</td>\n",
       "      <td>-0.490675</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-1.307658</td>\n",
       "      <td>-0.485670</td>\n",
       "      <td>1.612787</td>\n",
       "      <td>0.210169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.793757</td>\n",
       "      <td>-0.693757</td>\n",
       "      <td>-1.718367</td>\n",
       "      <td>0.515088</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          0         1         2         3\n",
       "0  1.123766  0.933920  0.494755  0.690507\n",
       "1  2.513636  0.575393 -0.323590  0.586833\n",
       "2 -0.335958 -0.843735  0.302201 -0.490675\n",
       "3 -1.307658 -0.485670  1.612787  0.210169\n",
       "4 -0.793757 -0.693757 -1.718367  0.515088"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.494755\n",
       "1   -0.323590\n",
       "2    0.302201\n",
       "3    1.612787\n",
       "4   -1.718367\n",
       "Name: 2, dtype: float64"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "col = data[2]\n",
    "col.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "339   -3.303616\n",
       "932    3.002853\n",
       "Name: 2, dtype: float64"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "col[np.abs(col) > 3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "选中所有绝对值大于3的行，可以用any方法在一个boolean DataFrame上："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3\n",
       "0 NaN NaN NaN NaN\n",
       "1 NaN NaN NaN NaN\n",
       "2 NaN NaN NaN NaN\n",
       "3 NaN NaN NaN NaN\n",
       "4 NaN NaN NaN NaN"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[(np.abs(data) > 3)].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>1.075728</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.951303</td>\n",
       "      <td>-3.133495</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155</th>\n",
       "      <td>0.064837</td>\n",
       "      <td>-4.328036</td>\n",
       "      <td>1.121061</td>\n",
       "      <td>-0.574203</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>224</th>\n",
       "      <td>-0.289148</td>\n",
       "      <td>-2.912116</td>\n",
       "      <td>0.332218</td>\n",
       "      <td>-3.129604</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>339</th>\n",
       "      <td>-0.098352</td>\n",
       "      <td>-0.610929</td>\n",
       "      <td>-3.303616</td>\n",
       "      <td>-2.072304</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>609</th>\n",
       "      <td>0.983240</td>\n",
       "      <td>1.372633</td>\n",
       "      <td>0.018172</td>\n",
       "      <td>3.002868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>735</th>\n",
       "      <td>3.318499</td>\n",
       "      <td>-2.573122</td>\n",
       "      <td>-1.515901</td>\n",
       "      <td>-1.204596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>822</th>\n",
       "      <td>1.810396</td>\n",
       "      <td>3.353001</td>\n",
       "      <td>-1.283856</td>\n",
       "      <td>-1.166749</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>856</th>\n",
       "      <td>-0.795070</td>\n",
       "      <td>3.204789</td>\n",
       "      <td>-0.211642</td>\n",
       "      <td>1.278828</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>932</th>\n",
       "      <td>0.898147</td>\n",
       "      <td>-0.961850</td>\n",
       "      <td>3.002853</td>\n",
       "      <td>-0.128494</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0         1         2         3\n",
       "22   1.075728  0.250000  0.951303 -3.133495\n",
       "155  0.064837 -4.328036  1.121061 -0.574203\n",
       "224 -0.289148 -2.912116  0.332218 -3.129604\n",
       "339 -0.098352 -0.610929 -3.303616 -2.072304\n",
       "609  0.983240  1.372633  0.018172  3.002868\n",
       "735  3.318499 -2.573122 -1.515901 -1.204596\n",
       "822  1.810396  3.353001 -1.283856 -1.166749\n",
       "856 -0.795070  3.204789 -0.211642  1.278828\n",
       "932  0.898147 -0.961850  3.002853 -0.128494"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[(np.abs(data) > 3).any(1)] # any中axis=1表示column"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "下面是把绝对值大于3的数字直接变成-3或3："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data[np.abs(data) > 3] = np.sign(data) * 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>-0.066111</td>\n",
       "      <td>-1.159064</td>\n",
       "      <td>0.518720</td>\n",
       "      <td>-0.284596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>1.075728</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.951303</td>\n",
       "      <td>-3.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           0         1         2         3\n",
       "21 -0.066111 -1.159064  0.518720 -0.284596\n",
       "22  1.075728  0.250000  0.951303 -3.000000"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[21:23]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.010634</td>\n",
       "      <td>0.013698</td>\n",
       "      <td>0.033466</td>\n",
       "      <td>-0.030997</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.010629</td>\n",
       "      <td>1.006768</td>\n",
       "      <td>1.003383</td>\n",
       "      <td>0.995521</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-2.994342</td>\n",
       "      <td>-3.000000</td>\n",
       "      <td>-3.000000</td>\n",
       "      <td>-3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>-0.654483</td>\n",
       "      <td>-0.662177</td>\n",
       "      <td>-0.644982</td>\n",
       "      <td>-0.670813</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>-0.000637</td>\n",
       "      <td>-0.033241</td>\n",
       "      <td>0.050481</td>\n",
       "      <td>-0.074641</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.723100</td>\n",
       "      <td>0.725839</td>\n",
       "      <td>0.708452</td>\n",
       "      <td>0.643418</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 0            1            2            3\n",
       "count  1000.000000  1000.000000  1000.000000  1000.000000\n",
       "mean      0.010634     0.013698     0.033466    -0.030997\n",
       "std       1.010629     1.006768     1.003383     0.995521\n",
       "min      -2.994342    -3.000000    -3.000000    -3.000000\n",
       "25%      -0.654483    -0.662177    -0.644982    -0.670813\n",
       "50%      -0.000637    -0.033241     0.050481    -0.074641\n",
       "75%       0.723100     0.725839     0.708452     0.643418\n",
       "max       3.000000     3.000000     3.000000     3.000000"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "np.sign(data)会根据值的正负号来得到1或-1："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1    2    3\n",
       "0  1.0  1.0  1.0  1.0\n",
       "1  1.0  1.0 -1.0  1.0\n",
       "2 -1.0 -1.0  1.0 -1.0\n",
       "3 -1.0 -1.0  1.0  1.0\n",
       "4 -1.0 -1.0 -1.0  1.0"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.sign(data).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 7 Permutation and Random Sampling（排列和随机采样）\n",
    "\n",
    "排列（随机排序）一个series或DataFrame中的row，用numpy.random.permutation函数很容易就能做到。调用permutation的时候设定好你想要进行排列的axis，会产生一个整数数组表示新的顺序："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>13</td>\n",
       "      <td>14</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>18</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3\n",
       "0   0   1   2   3\n",
       "1   4   5   6   7\n",
       "2   8   9  10  11\n",
       "3  12  13  14  15\n",
       "4  16  17  18  19"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4, 3, 2, 1, 0])"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sampler = np.random.permutation(5)\n",
    "sampler"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这个数组能被用在基于iloc上的indexing或take函数："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>18</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>13</td>\n",
       "      <td>14</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3\n",
       "4  16  17  18  19\n",
       "3  12  13  14  15\n",
       "2   8   9  10  11\n",
       "1   4   5   6   7\n",
       "0   0   1   2   3"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.take(sampler)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "为了选中一个随机的子集，而且没有代替功能(既不影响原来的值，返回一个新的series或DataFrame)，可以用sample方法："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>13</td>\n",
       "      <td>14</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>16</td>\n",
       "      <td>17</td>\n",
       "      <td>18</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3\n",
       "0   0   1   2   3\n",
       "3  12  13  14  15\n",
       "4  16  17  18  19"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sample(n=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果想要生成的样本带有替代功能(即允许重复)，给sample中设定replace=True:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4    4\n",
       "4    4\n",
       "1    7\n",
       "1    7\n",
       "1    7\n",
       "1    7\n",
       "4    4\n",
       "3    6\n",
       "1    7\n",
       "1    7\n",
       "dtype: int64"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "choices = pd.Series([5, 7, -1, 6, 4])\n",
    "\n",
    "draws = choices.sample(n=10, replace=True)\n",
    "\n",
    "draws"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 8 Computing Indicator/Dummy Variables（计算指示器/虚拟变量）\n",
    "\n",
    "> Dummy Variables：虚拟变量，又称虚设变量、名义变量或哑变量,用以反映质的属性的一个人工变量,是量化了的自变量,通常取值为0或1。\n",
    "\n",
    "另一种在统计模型上的转换或机器学习应用是把一个categorical variable(类别变量)变为一个dummy or indicator matrix（虚拟或指示器矩阵）。如果DataFrame中的一列有k个不同的值，我们可以用一个矩阵或DataFrame用k列来表示，1或0。pandas有一个get_dummies函数实现这个工作，当然，你自己设计一个其实也不难。这里举个例子："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data1</th>\n",
       "      <th>key</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   data1 key\n",
       "0      0   b\n",
       "1      1   b\n",
       "2      2   a\n",
       "3      3   c\n",
       "4      4   a\n",
       "5      5   b"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n",
    "                   'data1': range(6)})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     a    b    c\n",
       "0  0.0  1.0  0.0\n",
       "1  0.0  1.0  0.0\n",
       "2  1.0  0.0  0.0\n",
       "3  0.0  0.0  1.0\n",
       "4  1.0  0.0  0.0\n",
       "5  0.0  1.0  0.0"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.get_dummies(df['key'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "在一些情况里，如果我们想要给column加一个prefix， 可以用data.get_dummies里的prefix参数来实现："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dummies = pd.get_dummies(df['key'], prefix='key')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data1</th>\n",
       "      <th>key_a</th>\n",
       "      <th>key_b</th>\n",
       "      <th>key_c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   data1  key_a  key_b  key_c\n",
       "0      0    0.0    1.0    0.0\n",
       "1      1    0.0    1.0    0.0\n",
       "2      2    1.0    0.0    0.0\n",
       "3      3    0.0    0.0    1.0\n",
       "4      4    1.0    0.0    0.0\n",
       "5      5    0.0    1.0    0.0"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_with_dummy = df[['data1']].join(dummies)\n",
    "df_with_dummy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果DataFrame中的a row属于多个类别，事情会变得复杂一些。我们来看一下MoviesLens 1M 数据集："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mnames = ['movie_id', 'title', 'genres']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_id</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Animation|Children's|Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children's|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>Heat (1995)</td>\n",
       "      <td>Action|Crime|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>Sabrina (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>Tom and Huck (1995)</td>\n",
       "      <td>Adventure|Children's</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>Sudden Death (1995)</td>\n",
       "      <td>Action</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>GoldenEye (1995)</td>\n",
       "      <td>Action|Adventure|Thriller</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movie_id                               title                        genres\n",
       "0         1                    Toy Story (1995)   Animation|Children's|Comedy\n",
       "1         2                      Jumanji (1995)  Adventure|Children's|Fantasy\n",
       "2         3             Grumpier Old Men (1995)                Comedy|Romance\n",
       "3         4            Waiting to Exhale (1995)                  Comedy|Drama\n",
       "4         5  Father of the Bride Part II (1995)                        Comedy\n",
       "5         6                         Heat (1995)         Action|Crime|Thriller\n",
       "6         7                      Sabrina (1995)                Comedy|Romance\n",
       "7         8                 Tom and Huck (1995)          Adventure|Children's\n",
       "8         9                 Sudden Death (1995)                        Action\n",
       "9        10                    GoldenEye (1995)     Action|Adventure|Thriller"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies = pd.read_table('../datasets/movielens/movies.dat', sep='::',\n",
    "                       header=None, names=mnames, engine='python')\n",
    "movies[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "给每个genre添加一个指示变量比较麻烦。首先我们先取出所有不同的类别："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Animation', \"Children's\", 'Comedy', 'Adventure', 'Fantasy',\n",
       "       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',\n",
       "       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',\n",
       "       'Western'], dtype=object)"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_genres = []\n",
    "\n",
    "for x in movies.genres:\n",
    "    all_genres.extend(x.split('|'))\n",
    "    \n",
    "genres = pd.unique(all_genres)\n",
    "genres"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "一种构建indicator dataframe的方法是先构建一个全是0的DataFrame："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3883, 18)"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "zero_matrix = np.zeros((len(movies), len(genres)))\n",
    "\n",
    "zero_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Animation</th>\n",
       "      <th>Children's</th>\n",
       "      <th>Comedy</th>\n",
       "      <th>Adventure</th>\n",
       "      <th>Fantasy</th>\n",
       "      <th>Romance</th>\n",
       "      <th>Drama</th>\n",
       "      <th>Action</th>\n",
       "      <th>Crime</th>\n",
       "      <th>Thriller</th>\n",
       "      <th>Horror</th>\n",
       "      <th>Sci-Fi</th>\n",
       "      <th>Documentary</th>\n",
       "      <th>War</th>\n",
       "      <th>Musical</th>\n",
       "      <th>Mystery</th>\n",
       "      <th>Film-Noir</th>\n",
       "      <th>Western</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Animation  Children's  Comedy  Adventure  Fantasy  Romance  Drama  Action  \\\n",
       "0        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0   \n",
       "1        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0   \n",
       "2        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0   \n",
       "3        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0   \n",
       "4        0.0         0.0     0.0        0.0      0.0      0.0    0.0     0.0   \n",
       "\n",
       "   Crime  Thriller  Horror  Sci-Fi  Documentary  War  Musical  Mystery  \\\n",
       "0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "1    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "2    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "3    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "4    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "\n",
       "   Film-Noir  Western  \n",
       "0        0.0      0.0  \n",
       "1        0.0      0.0  \n",
       "2        0.0      0.0  \n",
       "3        0.0      0.0  \n",
       "4        0.0      0.0  "
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dummies = pd.DataFrame(zero_matrix, columns=genres)\n",
    "dummies.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "然后迭代每一部movie，并设置每一行中的dummies为1。使用dummies.columns来计算每一列的genre的指示器："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Animation', \"Children's\", 'Comedy']"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gen = movies.genres[0]\n",
    "gen.split('|')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 2])"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dummies.columns.get_indexer(gen.split('|'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "然后，使用.iloc，根据索引来设定值："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for i, gen in enumerate(movies.genres):\n",
    "    indices = dummies.columns.get_indexer(gen.split('|'))\n",
    "    dummies.iloc[i, indices] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Animation</th>\n",
       "      <th>Children's</th>\n",
       "      <th>Comedy</th>\n",
       "      <th>Adventure</th>\n",
       "      <th>Fantasy</th>\n",
       "      <th>Romance</th>\n",
       "      <th>Drama</th>\n",
       "      <th>Action</th>\n",
       "      <th>Crime</th>\n",
       "      <th>Thriller</th>\n",
       "      <th>Horror</th>\n",
       "      <th>Sci-Fi</th>\n",
       "      <th>Documentary</th>\n",
       "      <th>War</th>\n",
       "      <th>Musical</th>\n",
       "      <th>Mystery</th>\n",
       "      <th>Film-Noir</th>\n",
       "      <th>Western</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Animation  Children's  Comedy  Adventure  Fantasy  Romance  Drama  Action  \\\n",
       "0        1.0         1.0     1.0        0.0      0.0      0.0    0.0     0.0   \n",
       "1        0.0         1.0     0.0        1.0      1.0      0.0    0.0     0.0   \n",
       "2        0.0         0.0     1.0        0.0      0.0      1.0    0.0     0.0   \n",
       "3        0.0         0.0     1.0        0.0      0.0      0.0    1.0     0.0   \n",
       "4        0.0         0.0     1.0        0.0      0.0      0.0    0.0     0.0   \n",
       "\n",
       "   Crime  Thriller  Horror  Sci-Fi  Documentary  War  Musical  Mystery  \\\n",
       "0    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "1    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "2    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "3    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "4    0.0       0.0     0.0     0.0          0.0  0.0      0.0      0.0   \n",
       "\n",
       "   Film-Noir  Western  \n",
       "0        0.0      0.0  \n",
       "1        0.0      0.0  \n",
       "2        0.0      0.0  \n",
       "3        0.0      0.0  \n",
       "4        0.0      0.0  "
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dummies.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "然后，我们可以结合这个和movies:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "movie_id                                       1\n",
       "title                           Toy Story (1995)\n",
       "genres               Animation|Children's|Comedy\n",
       "Genre_Animation                                1\n",
       "Genre_Children's                               1\n",
       "Genre_Comedy                                   1\n",
       "Genre_Adventure                                0\n",
       "Genre_Fantasy                                  0\n",
       "Genre_Romance                                  0\n",
       "Genre_Drama                                    0\n",
       "Genre_Action                                   0\n",
       "Genre_Crime                                    0\n",
       "Genre_Thriller                                 0\n",
       "Genre_Horror                                   0\n",
       "Genre_Sci-Fi                                   0\n",
       "Genre_Documentary                              0\n",
       "Genre_War                                      0\n",
       "Genre_Musical                                  0\n",
       "Genre_Mystery                                  0\n",
       "Genre_Film-Noir                                0\n",
       "Genre_Western                                  0\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies_windic = movies.join(dummies.add_prefix('Genre_'))\n",
    "movies_windic.iloc[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对于一个很大的数据集，这种构建多个成员指示变量的方法并不会加快速度。写一个低层级的函数来直接写一个numpy array，并把写过整合到DataFrame会更快一些。\n",
    "\n",
    "一个有用的recipe诀窍是把get_dummies和离散函数（比如cut）结合起来："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "np.random.seed(12345)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.92961609,  0.31637555,  0.18391881,  0.20456028,  0.56772503,\n",
       "        0.5955447 ,  0.96451452,  0.6531771 ,  0.74890664,  0.65356987])"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "values = np.random.rand(10)\n",
    "values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "bins = [0, 0.2, 0.4, 0.6, 0.8, 1.]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0.8, 1], (0.2, 0.4], (0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]\n",
       "Categories (5, object): [(0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1]]"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.cut(values, bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>(0, 0.2]</th>\n",
       "      <th>(0.2, 0.4]</th>\n",
       "      <th>(0.4, 0.6]</th>\n",
       "      <th>(0.6, 0.8]</th>\n",
       "      <th>(0.8, 1]</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   (0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1]\n",
       "0       0.0         0.0         0.0         0.0       1.0\n",
       "1       0.0         1.0         0.0         0.0       0.0\n",
       "2       1.0         0.0         0.0         0.0       0.0\n",
       "3       0.0         1.0         0.0         0.0       0.0\n",
       "4       0.0         0.0         1.0         0.0       0.0\n",
       "5       0.0         0.0         1.0         0.0       0.0\n",
       "6       0.0         0.0         0.0         0.0       1.0\n",
       "7       0.0         0.0         0.0         1.0       0.0\n",
       "8       0.0         0.0         0.0         1.0       0.0\n",
       "9       0.0         0.0         0.0         1.0       0.0"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.get_dummies(pd.cut(values, bins))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [py35]",
   "language": "python",
   "name": "Python [py35]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
